% Read regulatory data (Regdata)
% This Matlab script reads the Regdata file, merges it with the BLS industry TFP data, and
%   writes the result out to a spreadsheet, from which we then prepare Table 8 in Stata.
% Regdata offers several choices of industry data, constructed in different ways. I focus on
% the BEA data, not the 2-digit or the 3-digit, because that concords best with the BLS MFP/TFP
% dataset. This was done at the suggestion of Patrick McLaughlin <pmclaughlin@mercatus.gmu.edu>.
% Patrick also sent the NAICS concordance from their BEA data (which was not, at the time,
% available on the website).
    			



%clear all

% Need a concordance between the Regdata industry numbering, and the BLS industry numbering
% This was prepared by Fernald in a spreadsheet called bea_naics_concordance.xlsx, which 
% matched on NAICS codes

% 1:43,  RegdataBEAList , BLSList

% Concordance table, one row per industry (43 rows).
%   bea_bls_pairs(:,1)  Regdata BEA industry code
%   bea_bls_pairs(:,2)  BLS industry number (8888 is used for Government)
% The running index 1:43 is prepended when the table is assembled below.
bea_bls_pairs = [
     4     23
     5     24
     7     26
     9     28
    10     29
    11     30
    15     13
    16     14
    18     16
    19     17
    20     18
    21     19
    23     12
    24     21
    26      3
    27      4
    29      6
    31      8
    32      9
    34     32
    35     33
    37     35
    38     36
    39     37
    40     38
    42     40
    43     41
    44     42
    48     46
    52     49
    53     50
    54     51
    55     52
    57     53
    60    127    % BLS aggregate is one produced by the industry code
    64     59
    67     61
    69     62
    71     63
    73     65
    77     67
    81     70
    82   8888
    ];

% Industry names, in the same row order as bea_bls_pairs
bea_industry_names = {
    'Farms'
    'Forestry, fishing, hunting, and related activities'
    'Oil and gas extraction'
    'Support activities for mining'
    'Utilities'
    'Construction'
    'Nonmetallic mineral product manufacturing'
    'Primary metal manufacturing'
    'Machinery manufacturing'
    'Computer and electronic product manufacturing'
    'Electrical equipment, appliance, and component manufacturing'
    'Transportation equipment manufacturing'
    'Wood product manufacturing'
    'Miscellaneous manufacturing'
    'Food, beverage and tobacco product manufacturing'
    'Textile and textile product mills'
    'Paper manufacturing'
    'Petroleum and coal products manufacturing'
    'Chemical manufacturing'
    'Wholesale trade'
    'Retail trade'
    'Air transportation'
    'Rail transportation'
    'Water transportation'
    'Truck transportation'
    'Pipeline transportation'
    'Other transportation and support activities'
    'Warehousing and storage'
    'Broadcasting and telecommunications'
    'Credit intermed. and related activities'
    'Securities, commods, and other fin. invest. activities'
    'Insurance carriers and related activities'
    'Funds, trusts, and other financial vehicles'
    'Real estate'
    'Legal, computer, prof'
    'Management of companies and enterprises'
    'Waste management and remediation services'
    'Education services'
    'Ambulatory health care services'
    'Social assistance'
    'Amusement, gambling, and recreation industries'
    'Other services'
    ' Government'
    };

% BEA_mapping: 43x4 cell {index, Regdata BEA code, BLS number, name}
BEA_mapping = [num2cell([(1:size(bea_bls_pairs,1)).', bea_bls_pairs]), bea_industry_names];

% BEA_concordance: the three numeric columns of BEA_mapping as a plain matrix
BEA_concordance = cell2mat(BEA_mapping(:,1:3));

clear bea_bls_pairs bea_industry_names   % helpers only; keep the workspace as before

%cd 'C:\Users\l1jgf01\Documents\Papers\BPEA_Hall, Stock, Watson\Data\Regulation' 
%addpath('C:\Documents and Settings\l1jgf01\My Documents\library' );

format short g   % display format for anything echoed to the console below

% Folder holding the Regdata workbook, relative to the current folder.
% The separator comes from filesep (as for the output file further down) instead of a
% literal '\', so the path also resolves on macOS/Linux. fileloc keeps its trailing
% separator, so [fileloc 'name'] concatenation keeps working.
fileloc = ['regdata' filesep]; % fill in if move files
dataloc = [fileloc 'regdata_by_3_digit_industry.xlsx'] ;


% Read the BEA-industry sheet: Num receives the numeric cells, Txt the text cells.
% NOTE(review): the code below sorts on Num(:,2) as the industry code and copies the four
% columns of Num into columns [1 2 4 5] of add_codes -- confirm Num has exactly 4 columns.
[Num, Txt] = xlsread(dataloc, 'regdata_by_bea_industry') ;
sortcol = 2; % will sort by industry 

 
% Want to rearrange to have data by industry, not by year 
[Y, index ] = sort(Num(:,sortcol)); % sort by industry;

% Start from a fresh matrix: "clear all" is disabled at the top of this script, so a
% leftover add_codes from an earlier run must not be reused here.
add_codes = NaN(size(Num,1), 5);
add_codes(:,[1 2 4 5]) = Num(index,:) ; % Omits #3 which will be the BLS grouping, added below. Now sorted by industry

% Now fill in column 3, the BLS number, from the concordance.
% Every Regdata code in column 2 is looked up in column 2 of BEA_concordance; a code with
% no match stops the script with a message naming the offending code(s).
[regcode_found, regcode_row] = ismember(add_codes(:,2), BEA_concordance(:,2));
if ~all(regcode_found)
    error('regdata:unmatchedIndustry', ...
          'Regdata industry code(s) not found in BEA_concordance: %s', ...
          mat2str(unique(add_codes(~regcode_found,2)).'));
end
add_codes(:,3) = BEA_concordance(regcode_row,3);
clear regcode_found regcode_row

stackdata = add_codes;

% Sample window of the stacked Regdata rows
yrstart  = 1970;
yrend    = 2014;
numyears = yrend - yrstart + 1;
industries = size(stackdata,1) / numyears;   % each industry contributes numyears rows
priv_ind_index = 1:42;   % since private industries are just a subset

% data(year, column, industry); one extra sheet at the end holds the private-industry total
data = NaN(numyears, size(Num,2)+1, industries+1);

% Unstack from 2D to 3D in one step. stackdata is ordered industry by industry, so a
% column-major reshape gives (year, industry, column); permute then moves industry to
% the 3rd dimension.
data(:,:,1:industries) = permute(reshape(stackdata, numyears, industries, []), [1 3 2]);

% Last sheet: element-wise sum over the private industries ...
data(:,:,end) = sum(data(:,:,priv_ind_index), 3) ;
% ... then repair the label columns that the sum destroyed. Column 1 (year) was added up
% once per industry, so divide it back; column 3 gets the code 8888 for the private total.
% Column 2 of this sheet is left as the sum of the industry codes.
data(:,1,end) = data(:,1,end) / numel(priv_ind_index) ; % years
data(:,3,end) = 8888;  % call 8888 for private industry sum total



% stack the 3D matrices (original industries, plus aggregates)

% growth_rates has one row fewer than data (the first year is lost in differencing) and
% the same columns and sheets.
growth_rates = NaN(numyears-1, size(data,2), size(data,3) ) ;

% Columns 1:3 are year and industry labels: carry them over from the second year on
growth_rates(:,1:3,:) = data(2:end,1:3,:) ;
% Columns 4:5: log growth in percent, differenced along the year dimension for every
% sheet at once
growth_rates(:,4:5,:) = 100*diff(log(data(:,4:5,:)), 1, 1) ;

% Row ranges of growth_rates for each sub-period. Years are read from column 1 of the
% first sheet.
gr_year_col = growth_rates(:,1,1);
gr_row_of   = @(yr) find(gr_year_col == yr);

Seventies = 1:gr_row_of(1980);                 % from the first growth-rate row through 1980
Reagan    = gr_row_of(1981):gr_row_of(1988);
GHBush    = gr_row_of(1989):gr_row_of(1992);
Clinton   = gr_row_of(1993):gr_row_of(2000);
WBush     = gr_row_of(2001):gr_row_of(2008);
Obama     = gr_row_of(2009):gr_row_of(2014);
% NOTE(review): Pre2005 starts in 1996, not at the start of the sample -- confirm intended
Pre2005   = gr_row_of(1996):gr_row_of(2004);
Post2004  = gr_row_of(2005):gr_row_of(2014);

clear gr_year_col gr_row_of   % helpers only

% Mean_restrictions(row, industry sheet, series)
%   row 1      industry code (column 2 of growth_rates)
%   rows 2-10  mean growth over: all years, Seventies, Reagan, GHBush, Clinton, WBush,
%              Obama, Pre2005, Post2004
%   row 11     100*VVAWt_full(BLS number, 1); left NaN for the last two sheets
%   series 1 / 2 correspond to columns 4 / 5 of growth_rates
meanper_rows = {1:size(growth_rates,1), Seventies, Reagan, GHBush, Clinton, ...
                WBush, Obama, Pre2005, Post2004};

Mean_restrictions = NaN(11, size(growth_rates,3), 2);
for ii = 4:5
    for jj = 1:size(growth_rates,3)
        Mean_restrictions(1,jj,ii-3) = growth_rates(1,2,jj);

        % rows 2..10, one per entry of meanper_rows
        for meanper_k = 1:numel(meanper_rows)
            Mean_restrictions(meanper_k+1,jj,ii-3) = ...
                mean(growth_rates(meanper_rows{meanper_k},ii,jj) );
        end

        if jj<size(BEA_mapping,1)  % will exclude final government one
            Mean_restrictions(11,jj,ii-3) = 100*VVAWt_full(BEA_concordance(jj,3),1) ;
        end
    end
end
clear meanper_rows meanper_k   % helpers only

%[Y index] = sort([
% Labels: one per row of Mean_restrictions, one per industry sheet (43 industries plus
% the private-industry total)
rowlabs = {'industry', 'all_years', 'Seventies', 'Reagan', 'GHBush', 'Clinton', ...
           'W', 'Obama', 'Pre2005', 'Post2004', 'VA Weight'} ;
collabs = [BEA_mapping(1:43,4).', {'all priv (arith)'}] ; % column labels

% Labelled tables for the two series
Meanreg1 = addLabels(Mean_restrictions(:,:,1), rowlabs, collabs) ;
Meanreg2 = addLabels(Mean_restrictions(:,:,2), rowlabs, collabs) ;
% Need to do the same thing for rules

% Order the sheets by (row 8, Obama) minus (row 7, W) of the first series, largest first
[Y, index] = sort(diff(Mean_restrictions([7 8],:,1), 1, 1), 2, 'descend') ;
% No trailing semicolon: the sorted table is echoed to the console
Tab_Sorted_by_Increase_after_08 = addLabels(Mean_restrictions(:,index,1), rowlabs, collabs(index))    %



%%  Now do panel regressions (ones in paper done in Stata)

%addpath(genpath('C:\Users\l1jgf01\Documents\library\paneldata'))  

% Look at pMunnel.m for suggestions

% Panel FE
% regfe = panel(id,year,y, X, 'fe');
% regfe.ynames = ynames;
% regfe.xnames = xnames;
% estdisp(regfe);

% choose years, etc.

nlags      = 5 ;                  % number of lags of the regulatory growth rates
tfpstart   = 1988;                % when to start tfp
startdat   = tfpstart - nlags;    % When to start the regulatory data
startstack = tfpstart ;           % in practice, this is the start date
endstack   = 2014;                % this is the end date

% Regression_list will be BLS numbers (e.g., to apply to dTFPv)
Regression_list = BEA_concordance(1:end-1,3)'; % All industries, John's BLS numbering. Row vector!
%Regression_list = [8 26 29 40]; % energy


% Translate each BLS number into the running index (column 1 of BEA_concordance, which
% starts at 1). E.g., BLS number 33 sits in row 21 of column 3, so it maps to the value in
% column 1 of row 21. Each BLS number must match exactly one row.
Regression_numbers = arrayfun( ...
    @(bls_code) BEA_concordance(find(BEA_concordance(:,3)==bls_code), 1), ...
    Regression_list) ;

% Rows of growth_rates for the first and last stacked year (reg growth starts 1971)
start_index = find(growth_rates(:,1,1) == startstack) ;
end_index   = find(growth_rates(:,1,1) == endstack) ;

% Keep only the sheets of the regression industries (picked by running index)
growth_rates_L0 = growth_rates(:,:, Regression_numbers);

% The lagged arrays are created under fixed names (L1..L5) and are read by those names
% further down, so nlags has to agree with them. Without this check a different nlags
% would either fail on an undefined name or pick up stale variables from an earlier run.
assert(nlags == 5, 'This section builds exactly five lags (L1..L5), but nlags = %d.', nlags);

growth_rates_L1 = NaN(size(growth_rates_L0)); % all the same size
growth_rates_L2 = NaN(size(growth_rates_L0)) ;
growth_rates_L3 = NaN(size(growth_rates_L0)) ;
growth_rates_L4 = NaN(size(growth_rates_L0)) ;
growth_rates_L5 = NaN(size(growth_rates_L0)) ;

% lagmatrix works on a 2D matrix, so lag sheet by sheet. All columns are lagged,
% including the year and industry label columns.
for ii = 1:size(growth_rates_L0,3)
    jj = Regression_numbers(ii);
    growth_rates_L1(:,:,ii)=lagmatrix(growth_rates(:,:,jj),1 );
    growth_rates_L2(:,:,ii)=lagmatrix(growth_rates(:,:,jj),2 );
    growth_rates_L3(:,:,ii)=lagmatrix(growth_rates(:,:,jj),3 );
    growth_rates_L4(:,:,ii)=lagmatrix(growth_rates(:,:,jj),4 );
    growth_rates_L5(:,:,ii)=lagmatrix(growth_rates(:,:,jj),5 );
end

% Now stack the 3D matrices into 2D: permute to (year, industry, column), then reshape so
% that the industries follow one another down the rows. Written out per lag instead of
% building the variable names with eval.
stack_lag_sheets = @(A) reshape(permute(A, [1 3 2]), [], size(A,2)) ;

full_growth_L0 = stack_lag_sheets(growth_rates_L0) ;
full_growth_L1 = stack_lag_sheets(growth_rates_L1) ;
full_growth_L2 = stack_lag_sheets(growth_rates_L2) ;
full_growth_L3 = stack_lag_sheets(growth_rates_L3) ;
full_growth_L4 = stack_lag_sheets(growth_rates_L4) ;
full_growth_L5 = stack_lag_sheets(growth_rates_L5) ;

clear stack_lag_sheets   % helper only
%     

% Now get the years we want.
% Bounded on both sides: the TFP series further down are cut to startstack..endstack, and
% the two stacks are placed side by side, so they must cover the same years. (With the
% data ending in endstack the upper bound does not drop any row.)
index = full_growth_L0(:,1) >= startstack & full_growth_L0(:,1) <= endstack;

% The same rows are taken from every lag so that they stay aligned
growth_L0 = full_growth_L0(index,:) ;
growth_L1 = full_growth_L1(index,:) ;
growth_L2 = full_growth_L2(index,:) ;
growth_L3 = full_growth_L3(index,:) ;
growth_L4 = full_growth_L4(index,:) ;
growth_L5 = full_growth_L5(index,:) ;

% Panel identifiers: column 1 is the year, column 3 the BLS industry number
year = growth_L0(:,1);
id   = growth_L0(:,3);

% Column 4 is the growth of restrictions, column 5 the growth of words
restrict = growth_L0(:,4);
words    = growth_L0(:,5);

restrict_L1 = growth_L1(:,4);
words_L1    = growth_L1(:,5);

restrict_L2 = growth_L2(:,4);
words_L2    = growth_L2(:,5);

restrict_L3 = growth_L3(:,4);
words_L3    = growth_L3(:,5);

restrict_L4 = growth_L4(:,4);
words_L4    = growth_L4(:,5);

restrict_L5 = growth_L5(:,4);
words_L5    = growth_L5(:,5);

% Averages of lags 1-3 and of lags 1-5 of restrictions growth
restrict_3yr = mean([restrict_L1 restrict_L2 restrict_L3],2);
restrict_5yr = mean([restrict_L1 restrict_L2 restrict_L3  restrict_L4 restrict_L5],2)  ;

% The lagged industry label must equal the current one: fails if a lag ran over the
% start of an industry's sheet (the label would then be NaN)
assert(all(growth_L1(:,3) == id)) 

% Pick the TFP rows for the stacked sample and stack them industry by industry.
% The growth series are indexed by the years MFPstartY+1..MFPendY.
tfp_growth_start = find((MFPstartY+1:MFPendY) == startstack) ; % will be 1+nlags (one lost in calc growth)
tfp_growth_end   = find((MFPstartY+1:MFPendY) == endstack) ;

% Rows: the sample years. Columns: the BLS numbers in Regression_list (not _numbers).
tfp_window = @(M) M(tfp_growth_start:tfp_growth_end, Regression_list) ;

dTFPv_data = tfp_window(dTFPv) ;
dY_data    = tfp_window(dY) ;
dv_data    = tfp_window(dv_new) ;
dHrs_data  = tfp_window(dHrs) ;

clear tfp_window   % helper only

% Column-major stacking: all years of the first industry, then the second, and so on
dTFPv_stack = dTFPv_data(:) ;
dY_stack    = dY_data(:) ;
dv_stack    = dv_data(:) ;
dHrs_stack  = dHrs_data(:) ;

% Output workbook (echoed to the console)
file_to_write = fullfile('out', 'data_for_panel.v2.xlsx')

% Sheet 'dTFPv': year, BLS industry and the four stacked outcome series
writeout = addLabels( ...
    [growth_L0(:,[1 3]), dTFPv_stack, dY_stack, dv_stack, dHrs_stack], ...
    {}, ...
    {'year', 'BLS industry', 'dTFPv', 'dY', 'dval_add', 'dhours'} ) ;
xlswrite(file_to_write, writeout, 'dTFPv')

% Sheet 'regulatory': the full stacked regulatory growth rates, all years (table is
% echoed to the console)
writeout_long_rest = addLabels( ...
    full_growth_L0, ...
    {}, ...
    {'year', 'Regdata_industry', 'BLS industry', 'restrict_L0', 'words_L0'} )
xlswrite(file_to_write, writeout_long_rest, 'regulatory')

% Dependent variable and its display label. Keep the two on one line so they cannot
% drift apart: previously y was dHrs_stack while the label still read 'dTFPv'.
y = dHrs_stack;        ynames = {'dHrs'};
% y = dTFPv_stack;     ynames = {'dTFPv'};

% Regressors: choose one. Only the active line is executed; the alternatives are kept
% as comments. xnames ends with an entry for the constant (Constant will be added).
% X = [restrict restrict_L1 restrict_L2 restrict_L3 restrict_L4 restrict_L5];
%     xnames = {'restrict','restrict_L1','restrict_L2', 'restrict_L3','restrict_L4','restrict_L5', 'CONST'};
% X = [words words_L1 words_L2 words_L3 words_L4 words_L5];
X = [restrict_3yr];

xnames = {'restrict3yr', 'Const'}

% OLS
regols = ols(y,X);
regols.ynames = ynames;
regols.xnames = xnames;
estdisp(regols);

% Clustered OLS (clusters are the industries)
regolsc = ols(y,X,'vartype','cluster','clusterid',id);
regolsc.ynames = ynames;
regolsc.xnames = xnames;
estdisp(regolsc);

% Panel FE
regfe = panel(id,year,y, X, 'fe');
regfe.ynames = ynames;
regfe.xnames = xnames;
estdisp(regfe);

% Panel FE with clustered residuals (replaces regfe from the step above)
regfe = panel(id,year,y, X, 'fe','vartype','cluster','clusterid',id);
regfe.ynames = ynames;
regfe.xnames = xnames;
estdisp(regfe);


% 
% 
% % Panel BE
% regbe = panel(id,year,y, X, 'be');
% regbe.ynames = ynames;
% regbe.xnames = xnames;
% estdisp(regbe);
% 
% % Panel RE
% regre = panel(id,year,y, X, 're');
% regre.ynames = ynames;
% regre.xnames = xnames;
% estdisp(regre);
% 
% % F test of individual effects
% effF = effectsftest(regfe);
% testdisp(effF);
% 
% % BP test for effects
% bpre = bpretest(regre);
% testdisp(bpre);
% 
% % Hausman test
% h = hausmantest(regfe, regre);
% testdisp(h);
% 
% % Mundlak test
% mu = mundlakvatest(regfe);
% testdisp(mu);
% 
% % Pool test
% po = pooltest(regfe);
% testdisp(po);
% 
% % Wooldridge serial test
% wo = woolserialtest(regfe);
% testdisp(wo);
% 
% wo = woolserialtest(regfe,'dfcorrection',0);
% testdisp(wo);
% 
% % Baltagi Li serial
% bl = blserialtest(regre);
% testdisp(bl)
% 
% % Pesaran CSD
% pecsdfe = pesarancsdtest(regfe);
% testdisp(pecsdfe);
% 
% pecsdre = pesarancsdtest(regre);
% testdisp(pecsdre);
% 
% 
% % Panel FE Robust
% regfer = panel(id, year, y, X, 'fe', 'vartype', 'robust');
% regfer.ynames = ynames;
% regfer.xnames = xnames;
% estdisp(regfer);
% 
% % Panel RE Robust
% regrer = panel(id, year, y, X, 're', 'vartype', 'robust');
% regrer.ynames = ynames;
% regrer.xnames = xnames;
% estdisp(regrer);
% 
% % Mundlak test
% mur = mundlakvatest(regfer);
% testdisp(mur);
% 
% % Individual effects
% ieffectsdisp(regfer);
% 
% % Individual effects
% ieffectsdisp(regfer,'overall');
% 


% If repeats, try the following
% [aaa I J]=unique([id year],'rows', 'first') ;
% hasDuplicates = size(aaa,1) < size([id],1) 
% ixDupRows = setdiff(1:size([id year],1), I)
% dupRowValues = year(ixDupRows,:)



% To do's: 
% add lags? Probably need a little function to do it.  
% E.g., send in startyear, and lags, and it will give me a matrix
% Do I call by years (as in the "year" dimension)? I think so.  


